##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Loading required package: viridisLite
library(stringr)
# Load the employee data set used throughout this analysis.
casestudy2 <- read.csv(".//CaseStudy2-data.csv") #casestudy2-data.csv

#Attrition by Department
# Stacked bar chart showing, for every department, the share of employees in
# each Attrition category. `..count..` is the per-bar row count computed by
# the count stat; dividing it by the per-x total (tapply over `..x..`) turns
# counts into within-department proportions. The same expression positions
# and formats the percentage labels. Long department names are wrapped at
# 10 characters so the axis labels do not overlap.
ggplot(casestudy2, aes(x = as.factor(Department), fill = Attrition)) +
  geom_bar(
    aes(y = ..count.. / tapply(..count.., ..x.., sum)[..x..]),
    position = "stack",
    width = 0.5
  ) +
  geom_text(
    aes(
      y = ..count.. / tapply(..count.., ..x.., sum)[..x..],
      label = scales::percent(..count.. / tapply(..count.., ..x.., sum)[..x..])
    ),
    stat = "count",
    position = position_stack(0.9),
    vjust = 0.5
  ) +
  xlab('Department') +
  ylab('Percent of Attrition') +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 10)) +
  theme(axis.text = element_text(size = 7))

# Numeric attrition flag: 1 when Attrition is 'Yes', 0 for everything else.
# This was previously fused onto the end of the plot's theme() call, which
# does not parse; it is a separate statement.
casestudy2$Attritioncalc <- case_when(
  casestudy2$Attrition == 'Yes' ~ 1,
  TRUE ~ 0
)
#summary
# One row per Department: mean monthly income, attrition rate (share of rows
# whose Attritioncalc flag is 1) and head count, largest departments first.
er <- casestudy2 %>%
  group_by(Department) %>%
  summarise(
    meanincome = mean(MonthlyIncome),
    calcAttrition = (sum(Attritioncalc) / n()),
    Employees = n()
  ) %>%
  arrange(desc(Employees))
er
## # A tibble: 3 × 4
## Department meanincome calcAttrition Employees
## <chr> <dbl> <dbl> <int>
## 1 Research & Development 6173. 0.133 562
## 2 Sales 6789. 0.216 273
## 3 Human Resources 6776. 0.171 35
#Attrition % by Job Role and Department
#summary table
# One row per Department / JobRole pair: mean monthly income, attrition rate
# and head count, largest groups first.
# `.groups = "drop_last"` spells out dplyr's default (the result stays grouped
# by Department), so summarise() no longer emits its "has grouped output by
# 'Department'" message. arrange() ignores grouping by default, so the sort on
# Employees is global.
er <- casestudy2 %>%
  group_by(Department, JobRole) %>%
  summarize(
    meanincome = mean(MonthlyIncome),
    calcAttrition = (sum(Attritioncalc) / n()),
    Employees = n(),
    .groups = "drop_last"
  ) %>%
  arrange(desc(Employees))
## # A tibble: 11 × 5
## # Groups: Department [3]
## Department JobRole meanincome calcAttrition Employees
## <chr> <chr> <dbl> <dbl> <int>
## 1 Sales Sales Executive 6892. 0.165 200
## 2 Research & Development Research Scientist 3259. 0.186 172
## 3 Research & Development Laboratory Technic… 3222. 0.196 153
## 4 Research & Development Manufacturing Dire… 7505. 0.0230 87
## 5 Research & Development Healthcare Represe… 7435. 0.105 76
## 6 Sales Sales Representati… 2653. 0.453 53
## 7 Research & Development Research Director 15750. 0.0196 51
## 8 Human Resources Human Resources 3285. 0.222 27
## 9 Research & Development Manager 17139. 0.0870 23
## 10 Sales Manager 16719. 0.1 20
## 11 Human Resources Manager 18560 0 8
#graph
# Bubble chart of attrition by Department (x) and JobRole (y): bubble size is
# the attrition rate, bubble colour is the number of employees in the group.
# alpha is a fixed setting, so it is passed outside aes(); inside aes() it was
# treated as a mapped variable and added a spurious "alpha" legend entry.
# NOTE(review): geom_polygon() on two discrete axes looks like a leftover from
# a map template; it is kept unchanged here - confirm it is intended.
ggplot() +
  geom_polygon(
    data = er,
    aes(x = Department, y = JobRole),
    color = "white",
    fill = "grey",
    alpha = 0.5
  ) +
  geom_point(
    data = er,
    aes(x = Department, y = JobRole, color = Employees, size = calcAttrition),
    alpha = 0.5
  ) +
  scale_color_viridis(option = "viridis", name = "Employees") +
  scale_size(range = c(1, 10), name = "Attrition %") +
  ggtitle("Attrition by Role")
#Attrition % by Job Level and Department
#summary
# One row per Department / JobLevel pair: mean monthly income, attrition rate
# and head count, largest groups first.
# `.groups = "drop_last"` spells out dplyr's default (the result stays grouped
# by Department), so summarise() no longer emits its "has grouped output by
# 'Department'" message. arrange() ignores grouping by default, so the sort on
# Employees is global.
er <- casestudy2 %>%
  group_by(Department, JobLevel) %>%
  summarize(
    meanincome = mean(MonthlyIncome),
    calcAttrition = (sum(Attritioncalc) / n()),
    Employees = n(),
    .groups = "drop_last"
  ) %>%
  arrange(desc(Employees))
## # A tibble: 15 × 5
## # Groups: Department [3]
## Department JobLevel meanincome calcAttrition Employees
## <chr> <int> <dbl> <dbl> <int>
## 1 Research & Development 1 2793. 0.219 256
## 2 Research & Development 2 5435. 0.0542 166
## 3 Sales 2 5678. 0.146 144
## 4 Research & Development 3 10248. 0.0909 77
## 5 Sales 3 9331. 0.189 53
## 6 Sales 1 2519. 0.48 50
## 7 Research & Development 4 15374. 0.0256 39
## 8 Research & Development 5 19304. 0.0833 24
## 9 Human Resources 1 2691. 0.261 23
## 10 Sales 4 14863. 0.105 19
## 11 Sales 5 18965. 0.286 7
## 12 Human Resources 5 19207. 0 6
## 13 Human Resources 2 4982. 0 2
## 14 Human Resources 3 8412. 0 2
## 15 Human Resources 4 16618 0 2
#graph
# Bubble chart of attrition by Department (x) and JobLevel (y): bubble size is
# the attrition rate, bubble colour is the number of employees in the group.
# alpha is a fixed setting, so it is passed outside aes(); inside aes() it was
# treated as a mapped variable and added a spurious "alpha" legend entry.
ggplot() +
  geom_point(
    data = er,
    aes(x = Department, y = JobLevel, color = Employees, size = calcAttrition),
    alpha = 0.5
  ) +
  scale_color_viridis(option = "viridis", name = "Employees") +
  scale_size(range = c(1, 10), name = "Attrition %") +
  ggtitle("Attrition by Job Level")
#summary
# One row per Department / JobLevel pair: mean monthly income, attrition rate
# and head count, largest groups first.
# NOTE(review): this repeats the Department / JobLevel summary computed
# earlier in this file - confirm the second copy is needed.
# `.groups = "drop_last"` spells out dplyr's default (the result stays grouped
# by Department), so summarise() no longer emits its "has grouped output by
# 'Department'" message. arrange() ignores grouping by default, so the sort on
# Employees is global.
er <- casestudy2 %>%
  group_by(Department, JobLevel) %>%
  summarize(
    meanincome = mean(MonthlyIncome),
    calcAttrition = (sum(Attritioncalc) / n()),
    Employees = n(),
    .groups = "drop_last"
  ) %>%
  arrange(desc(Employees))
## # A tibble: 15 × 5
## # Groups: Department [3]
## Department JobLevel meanincome calcAttrition Employees
## <chr> <int> <dbl> <dbl> <int>
## 1 Research & Development 1 2793. 0.219 256
## 2 Research & Development 2 5435. 0.0542 166
## 3 Sales 2 5678. 0.146 144
## 4 Research & Development 3 10248. 0.0909 77
## 5 Sales 3 9331. 0.189 53
## 6 Sales 1 2519. 0.48 50
## 7 Research & Development 4 15374. 0.0256 39
## 8 Research & Development 5 19304. 0.0833 24
## 9 Human Resources 1 2691. 0.261 23
## 10 Sales 4 14863. 0.105 19
## 11 Sales 5 18965. 0.286 7
## 12 Human Resources 5 19207. 0 6
## 13 Human Resources 2 4982. 0 2
## 14 Human Resources 3 8412. 0 2
## 15 Human Resources 4 16618 0 2
#graph
# Bubble chart of attrition by Department (x) and JobLevel (y): bubble size is
# the attrition rate, bubble colour is the number of employees in the group.
# alpha is a fixed setting, so it is passed outside aes(); inside aes() it was
# treated as a mapped variable and added a spurious "alpha" legend entry.
ggplot() +
  geom_point(
    data = er,
    aes(x = Department, y = JobLevel, color = Employees, size = calcAttrition),
    alpha = 0.5
  ) +
  scale_color_viridis(option = "viridis", name = "Employees") +
  scale_size(range = c(1, 10), name = "Attrition %") +
  ggtitle("Attrition by Job Level")
### By Department
##### The Sales Department has the highest % of Attrition (21.6%)
##### 54% of Attrition comes from Research and Development, but Research and Development is the largest Department.
# k-NN or naive Bayes but may also use other models (logistic regression, random forest, LDA, SVM, etc)
#as long as you compare the results between the two or more models.
#You may then use any of the models to fulfill the 60/60 sensitivity/specificity requirement.
#This goes for regression as well; you must use linear regression but may include additional models for comparison and use in the competition (LASSO, random forest, ensemble models, etc.).
# Load the competition set that is used as the k-NN test set below.
casestudy2.noattrition <- read.csv(".//CaseStudy2CompSet No Salary.csv")
library(class)
library(caret) ## Loading required package: lattice
library(e1071)

# Numeric predictors used by the k-NN model.
# Names are the column names of the prepared data frames; values are the
# source column names in the raw data.
knn_features <- c(
  Age = "Age",
  JobInvolvement = "JobInvolvement",
  JobLevel = "JobLevel",
  Distance = "DistanceFromHome",
  StockOptionLevel = "StockOptionLevel",
  EnvironmentSatisfaction = "EnvironmentSatisfaction",
  RelationshipSatisfaction = "RelationshipSatisfaction",
  JobSatisfaction = "JobSatisfaction",
  YearSinceLastPromotion = "YearsSinceLastPromotion",
  YearsinCurrentRole = "YearsInCurrentRole",
  Education = "Education"
)

# Standardize the training features, then apply the SAME centers and scales
# to the test features. Previously each set was standardized with its own
# mean/sd, which puts the two sets on slightly different scales and distorts
# the distances k-NN relies on.
# NOTE: metrics recorded further down in this file were produced with the
# older per-set scaling and will shift slightly when the script is re-run.
train_scaled <- scale(casestudy2[, unname(knn_features)])
test_scaled <- scale(
  casestudy2.noattrition[, unname(knn_features)],
  center = attr(train_scaled, "scaled:center"),
  scale = attr(train_scaled, "scaled:scale")
)
colnames(train_scaled) <- names(knn_features)
colnames(test_scaled) <- names(knn_features)

# Column 1 is the Attrition label; columns 2:12 are the scaled predictors.
clean_casestudy2 <- data.frame(
  Attrition = casestudy2$Attrition,
  train_scaled
)
casestudy2.noatt <- data.frame(
  Attrition = casestudy2.noattrition$Attrition,
  test_scaled
)
test <- casestudy2.noatt
train <- clean_casestudy2
#confusion matrix results for each row
# Fit k-NN for k = 1..numks on the scaled predictors (columns 2:12 of
# train/test) and record accuracy, sensitivity and specificity for each k.
numks <- 30
masterAcc <- matrix(nrow = numks)
masterSens <- matrix(nrow = numks)
masterSpec <- matrix(nrow = numks)
masterK <- matrix(nrow = numks)

# knn() breaks ties at random, so fix the seed to make the sweep repeatable.
set.seed(1)
for (i in seq_len(numks)) {
  classifications <- knn(
    train[, c(2:12)],
    test[, c(2:12)],
    as.factor(train$Attrition),
    prob = TRUE,
    k = i
  )
  # results for accuracy, sensitivity, and specificity
  # caret reads a table as predictions in rows and reference in columns.
  # The previous table(actual, predicted) was therefore transposed, so the
  # values stored as Sensitivity/Specificity were really predictive values.
  # Passing data/reference explicitly removes the ambiguity.
  CM <- confusionMatrix(
    data = classifications,
    reference = factor(test$Attrition, levels = levels(classifications))
  )
  masterAcc[i] <- CM$overall[["Accuracy"]]
  masterSens[i] <- CM[["byClass"]][["Sensitivity"]]
  masterSpec[i] <- CM[["byClass"]][["Specificity"]]
  masterK[i] <- i
}

# One row per k, with named metric columns.
Overall <- data.frame(
  Accuracy = as.vector(masterAcc),
  Sensitivity = as.vector(masterSens),
  Specificity = as.vector(masterSpec),
  K = as.vector(masterK)
)
MeanAcc <- colMeans(Overall)
MeanAcc
# The output below was recorded before the confusion-matrix orientation fix.
# Accuracy and K are unaffected by that fix; the Sensitivity and Specificity
# values will change when the script is re-run.
## Accuracy Sensitivity Specificity K
## 0.8191111 0.8356097 0.3526761 15.5000000
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ tibble 3.1.6 ✓ purrr 0.3.4
## ✓ tidyr 1.1.4 ✓ forcats 0.5.1
## ✓ readr 2.1.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x purrr::lift() masks caret::lift()
# Reshape the per-k metrics to long format (one row per K / metric pair) so
# each metric can be drawn as its own line.
# pivot_longer() replaces the superseded gather(); the resulting columns are
# still K, variable and value. Rows now come out ordered by K instead of by
# metric, which does not affect the line plot.
df <- Overall %>%
  select(K, Accuracy, Sensitivity, Specificity) %>%
  pivot_longer(
    cols = -K,
    names_to = "variable",
    values_to = "value"
  )
#graph of knn results
# Line chart of accuracy, sensitivity and specificity against k, one line per
# metric. The y-axis label typo ("Measurments") is corrected.
# NOTE(review): the vertical marker at K = 16 is hard-coded - presumably the
# chosen k; confirm it still matches the results after a re-run.
ggplot(df, aes(x = K, y = value)) +
  geom_line(aes(color = variable, linetype = variable)) +
  scale_color_manual(values = c("black", "steelblue", "blue")) +
  geom_vline(xintercept = 16, color = "darkgreen") +
  ylab('Confusion Matrix Measurements') +
  ggtitle("KNN Results")
##
## Call:
## lm(formula = MonthlyIncome ~ JobLevel + YearsInCurrentRole, data = casestudy2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4981.4 -928.0 71.8 693.6 3751.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1769.71 104.03 -17.012 <2e-16 ***
## JobLevel 4034.21 47.78 84.425 <2e-16 ***
## YearsInCurrentRole -15.72 14.31 -1.099 0.272
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1413 on 867 degrees of freedom
## Multiple R-squared: 0.9057, Adjusted R-squared: 0.9055
## F-statistic: 4166 on 2 and 867 DF, p-value: < 2.2e-16
## 2.5 % 97.5 %
## (Intercept) -1973.88338 -1565.52861
## JobLevel 3940.42571 4128.00029
## YearsInCurrentRole -43.81716 12.36847
## `summarise()` has grouped output by 'JobLevel'. You can override using the `.groups` argument.
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
## Warning: 'surface' objects don't have these attributes: 'mode'
## Valid attributes include:
## '_deprecated', 'autocolorscale', 'cauto', 'cmax', 'cmid', 'cmin', 'coloraxis', 'colorbar', 'colorscale', 'connectgaps', 'contours', 'customdata', 'customdatasrc', 'hidesurface', 'hoverinfo', 'hoverinfosrc', 'hoverlabel', 'hovertemplate', 'hovertemplatesrc', 'hovertext', 'hovertextsrc', 'ids', 'idssrc', 'legendgroup', 'legendgrouptitle', 'legendrank', 'lighting', 'lightposition', 'meta', 'metasrc', 'name', 'opacity', 'opacityscale', 'reversescale', 'scene', 'showlegend', 'showscale', 'stream', 'surfacecolor', 'surfacecolorsrc', 'text', 'textsrc', 'type', 'uid', 'uirevision', 'visible', 'x', 'xcalendar', 'xhoverformat', 'xsrc', 'y', 'ycalendar', 'yhoverformat', 'ysrc', 'z', 'zcalendar', 'zhoverformat', 'zsrc', 'key', 'set', 'frame', 'transforms', '_isNestedKey', '_isSimpleKey', '_isGraticule', '_bbox'
## randomForest 4.7-1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
## Loading required package: measures
##
## Attaching package: 'measures'
## The following objects are masked from 'package:caret':
##
## MAE, RMSE
## Loading required package: party
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
##
## Attaching package: 'strucchange'
## The following object is masked from 'package:stringr':
##
## boundary
##
## Attaching package: 'varImp'
## The following object is masked from 'package:caret':
##
## varImp